# This script is for calculate the best subset method
# 选取最优子集数为2的最佳子集

library('leaps')
load('train.data.normal.rda')
subset<-regsubsets(data=train.data.normal,lpsa~.)
summary(subset)

# 可以看到最优子集数为2的子集是(lcavol,lweight)
# 用lm做回归，得到结果
linear.model.subset <- lm(data=train.data.normal,lpsa~lcavol+lweight)
summary(linear.model.subset)
# 结果与书P63页表3.3一致

# 计算检验误差和标准误差(Test Error and Std Error)
load("test.data.normal.rda")
a<-rep(1,length(test.data.normal[,1]))
X.test<-cbind(Intercept=a,test.data.normal[,c(1,2)])# 1 for lcavol;2 for lweight
Y.test<-as.matrix(X.test)%*%as.matrix(linear.model.subset$coefficients)
Y.true<-as.matrix(test.data.normal[,length(test.data.normal)])
Test.Error<-sum((Y.test-Y.true)*(Y.test-Y.true))/length(Y.test)
#Std.Error<-sum((Y.test-Y.true)*(Y.test-Y.true))/(length(Y.test)-3)/sqrt(length(Y.test)-4)
# 标准误差的计算依然有问题！！不再考虑标准误差,因为不影响分析,可以用检验误差去做标准! 
